In [1]:
import os, shutil, pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from keras.callbacks import TensorBoard
from keras.models import Sequential, Model, load_model
from keras.layers import Flatten, Dense, Dropout, RandomFlip, RandomRotation
from tensorflow.keras.layers import Embedding
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.applications import ResNet50
from keras.layers import RandomZoom, GlobalAveragePooling2D
from tensorflow.keras.regularizers import l2
2024-05-14 11:43:33.768097: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
In [2]:
from PIL import Image
In [3]:
img_width = 224
img_height = 224
random_seed = 123
batch_size = 32
In [4]:
working_dir = '/Users/austensteinberg/Desktop/Machine Learning/ML_Final/'
In [5]:
geo_train = pathlib.Path(os.path.join(working_dir, 'train_dataset'))
geo_test = pathlib.Path(os.path.join(working_dir, 'test_dataset'))
In [6]:
def extract_meta(path):
    meta = []
    for file_path in path.glob('*'):
        #provide width, height 
        width, height = Image.open(file_path).size
        
        metadata.append({
            'country': path.name,
            'image_name' :file_path.name,
            'width': width,
            'height' : height,
            'size' :file_path.stat().st_size,
            'path' : file_path
        })
        
    return
In [7]:
import os
import pandas as pd

# Define the path to the folder containing the country folders
folder_path = '/Users/austensteinberg/Desktop/Machine Learning /ML_Final/compressed_dataset'

# Initialize empty lists to store metadata
all_metadata = {
    'country': [],
    'image_name': []
}

# Iterate through each country folder
for country_folder in os.listdir(folder_path):
    country_path = os.path.join(folder_path, country_folder)
    
    # Check if the item in the folder is indeed a directory
    if os.path.isdir(country_path):
        # Get a list of image files in the country folder
        image_files = [f for f in os.listdir(country_path) if f.endswith('.jpg') or f.endswith('.png')]
        
        # Add metadata for each image in the country folder
        for image in image_files:
            all_metadata['country'].append(country_folder)
            all_metadata['image_name'].append(image)

# Create a DataFrame from the metadata
df_geo_data = pd.DataFrame(all_metadata)

# Group by 'country' and count the frequency of 'image_name'
df_data_distribution = df_geo_data.groupby('country')['image_name'].count().reset_index().rename(columns={'image_name': 'frequency'})

# Print the DataFrame to see the data distribution
print(df_data_distribution)
            country  frequency
0             Aland          9
1           Albania         41
2    American Samoa         16
3           Andorra         13
4        Antarctica          1
..              ...        ...
119  United Kingdom       2484
120   United States      12014
121         Uruguay         57
122       Venezuela          1
123         Vietnam         15

[124 rows x 2 columns]
In [10]:
print(df_geo_data)
        country             image_name
0        Bhutan  canvas_1629262074.jpg
1        Bhutan  canvas_1629527767.jpg
2        Bhutan  canvas_1629551780.jpg
3        Bhutan  canvas_1629992395.jpg
4        Bhutan  canvas_1629687800.jpg
...         ...                    ...
49992   Iceland  canvas_1629971749.jpg
49993   Iceland  canvas_1629503620.jpg
49994   Iceland  canvas_1629521495.jpg
49995  Paraguay  canvas_1629908268.jpg
49996  Paraguay  canvas_1630216530.jpg

[49997 rows x 2 columns]
In [9]:
df_geo_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49997 entries, 0 to 49996
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   country     49997 non-null  object
 1   image_name  49997 non-null  object
dtypes: object(2)
memory usage: 781.3+ KB
In [17]:
import random
from PIL import Image
import plotly.graph_objects as go

# Select a random country
random_country = df_geo_data.sample(1)

# Get the path to a random image in the selected country
country_path = os.path.join(folder_path, country_folder).format(random_country['country'].iloc[0])
image_files = [f for f in os.listdir(country_path) if f.endswith('.jpg') or f.endswith('.png')]
random_image = random.choice(image_files)
image_path = os.path.join(country_path, random_image)

# Open the image using PIL
img = Image.open(image_path)

# Create a plotly figure to display the image
fig = go.Figure()
fig.add_trace(go.Image(z=img))
fig.update_layout(
    title="Image from the training dataset",
    template="plotly_white",
    width=900,
    height=450
)
fig.update_xaxes(title_text="Width")
fig.update_yaxes(title_text="Height")
fig.add_annotation(
    x=0.5,
    y=1.1,
    xref="paper",
    yref="paper",
    text="Country : {}".format(random_country['country'].iloc[0]),
    showarrow=False
)
fig.show()
In [8]:
print("The total number of images in the dataset are {:,} images.".format(df_data_distribution.frequency.sum()))
print("The average number of images per country is {:,} images.".format(int(df_data_distribution.frequency.mean())))
print("The median number of images per country is {:,} images.".format(int(df_data_distribution.frequency.median())))
print("25% of the countries have {:,} images or less, while 75% of the countries have {:,} images or more.".format(int(df_data_distribution.frequency.quantile(0.25)), int(df_data_distribution.frequency.quantile(0.75))))
print("The maximum and minimum number of images per country are {:,} and {:,} images respectively.".format(df_data_distribution.frequency.max(), df_data_distribution.frequency.min()))
The total number of images in the dataset are 49,997 images.
The average number of images per country is 403 images.
The median number of images per country is 81 images.
25% of the countries have 13 images or less, while 75% of the countries have 268 images or more.
The maximum and minimum number of images per country are 12,014 and 1 images respectively.
In [18]:
import plotly.express as px
from plotly.offline import init_notebook_mode, iplot
# Function to add custom styling to the plot
def add_plot_styling(fig):
    fig.update_layout(
        plot_bgcolor='rgba(0,0,0,0.05)',  # Set background color to transparent
    )
    return fig


fig = px.box(
    df_data_distribution, 
    x="frequency", 
    title="Distribution of Images per Country", 
    hover_data=["frequency"],
    points="all")

# change the dimensions of the figure
fig.update_layout(xaxis_title="Country", yaxis_title="Number of images", height=500)
fig = add_plot_styling(fig)

# Use iplot to display the graph in the Jupyter Notebook
iplot(fig)
In [22]:
import plotly.offline as pyo
import math
# Function to plot heatmap
def plot_heatmap(df, title_text, color_threshold=0.90):
    # Shaping Data to fit graph by finding the max size of the grid to fit all the images
    grid_size = math.ceil(math.sqrt(df.shape[0]))**2
    
    pad = '-'
    n_labels = df.country.to_list() + ([pad] * (grid_size - df.shape[0]))
    n_samples = df.frequency.to_list() + ([np.nan] * (grid_size - df.shape[0]))
    
    samples = np.split(np.array(n_samples), math.sqrt(grid_size))
    
    fig = go.Figure(data=go.Heatmap(
        z=samples,
        x0=0,
        dx=1,
        y0=0,
        dy=1,
        colorscale='Emrld',
        text=n_labels,
        hoverinfo='text',
    ))
    # Add labels within the heatmap
    for i, country in enumerate(df['country']):
        x_pos = i % int(math.sqrt(grid_size))
        y_pos = i // int(math.sqrt(grid_size))
        fig.add_annotation(
            x=x_pos,  # Center the label within the grid cell
            y=y_pos,
            text= "{}<br> Count: {}".format(n_labels[i].replace(' ', '<br>'), n_samples[i]),
            showarrow=False,
            font=dict(
                color='#474747' if n_samples[i] < df.frequency.quantile(color_threshold) else 'white',
                size=10),
        )
    
    # hide axis labels
    fig.update_xaxes(showticklabels=False)
    fig.update_yaxes(showticklabels=False, autorange='reversed')
    
    # change the dimensions of the figure
    fig.update_layout(
        title=title_text,
        height=1200,
        width=1200
    )
    fig = add_plot_styling(fig)
    
    # Use iplot to display the graph in the Jupyter Notebook
    pyo.iplot(fig)
In [28]:
plot_heatmap(df_data_distribution.sort_values(by='frequency', ascending=True), "Images per Country", color_threshold=0.98)
In [9]:
USA = df_geo_data[df_geo_data['country'] == 'United States'].sample(n=2000)
In [10]:
df_geo_data.merge(USA, on=['country', 'image_name'], how='left')
Out[10]:
country image_name
0 Bhutan canvas_1629262074.jpg
1 Bhutan canvas_1629527767.jpg
2 Bhutan canvas_1629551780.jpg
3 Bhutan canvas_1629992395.jpg
4 Bhutan canvas_1629687800.jpg
... ... ...
49992 Iceland canvas_1629971749.jpg
49993 Iceland canvas_1629503620.jpg
49994 Iceland canvas_1629521495.jpg
49995 Paraguay canvas_1629908268.jpg
49996 Paraguay canvas_1630216530.jpg

49997 rows × 2 columns

In [11]:
filtered_countries = df_data_distribution[(df_data_distribution.frequency >= 50) & (df_data_distribution.frequency <= 10000) ].country.unique()
In [12]:
cut_countries = df_data_distribution[(df_data_distribution.frequency < 50)].country.unique()
                
In [13]:
print(cut_countries)
['Aland' 'Albania' 'American Samoa' 'Andorra' 'Antarctica' 'Armenia'
 'Belarus' 'Bermuda' 'Bhutan' 'China' 'Costa Rica' 'Curacao'
 'Dominican Republic' 'Egypt' 'Faroe Islands' 'Gibraltar' 'Greenland'
 'Guam' 'Iraq' 'Isle of Man' 'Jersey' 'Lebanon' 'Luxembourg' 'Macao'
 'Madagascar' 'Martinique' 'Monaco' 'Montenegro' 'Mozambique' 'Myanmar'
 'Nepal' 'North Macedonia' 'Northern Mariana Islands' 'Pakistan'
 'Palestine' 'Paraguay' 'Pitcairn Islands' 'Puerto Rico' 'Qatar' 'Reunion'
 'San Marino' 'South Georgia and South Sandwich Islands' 'South Sudan'
 'Svalbard and Jan Mayen' 'Tanzania' 'US Virgin Islands' 'Venezuela'
 'Vietnam']
In [14]:
df_geo_data.merge(USA, on=['country', 'image_name'], how='left')
Out[14]:
country image_name
0 Bhutan canvas_1629262074.jpg
1 Bhutan canvas_1629527767.jpg
2 Bhutan canvas_1629551780.jpg
3 Bhutan canvas_1629992395.jpg
4 Bhutan canvas_1629687800.jpg
... ... ...
49992 Iceland canvas_1629971749.jpg
49993 Iceland canvas_1629503620.jpg
49994 Iceland canvas_1629521495.jpg
49995 Paraguay canvas_1629908268.jpg
49996 Paraguay canvas_1630216530.jpg

49997 rows × 2 columns

In [15]:
df_filtered_geo_data = df_geo_data[df_geo_data.country.isin(filtered_countries)]
df_filtered_distribution = df_filtered_geo_data.groupby('country')['image_name'].count().reset_index().rename(columns={'image_name': 'frequency'})

df_filtered_geo_data
Out[15]:
country image_name
20 Kenya canvas_1629925277.jpg
21 Kenya canvas_1629837897.jpg
22 Kenya canvas_1630278825.jpg
23 Kenya canvas_1629433275.jpg
24 Kenya canvas_1630286494.jpg
... ... ...
49990 Iceland canvas_1630075636.jpg
49991 Iceland canvas_1629452069.jpg
49992 Iceland canvas_1629971749.jpg
49993 Iceland canvas_1629503620.jpg
49994 Iceland canvas_1629521495.jpg

37408 rows × 2 columns

In [29]:
plot_heatmap(df_filtered_distribution.sort_values(by='frequency', ascending=True), "Pictures per Country (>= 50)", color_threshold=0.94)
In [24]:
df_filtered_distribution.count()
Out[24]:
country      75
frequency    75
dtype: int64
In [19]:
fig = px.box(
    df_filtered_distribution, 
    x="frequency", 
    title="Distribution of Images per Country (filtered)", 
    hover_data=["frequency"],
    points="all")

# change the dimensions of the figure
fig.update_layout(xaxis_title="Country", yaxis_title="Number of images", height=600)
fig = add_plot_styling(fig)

# Use iplot to display the graph in the Jupyter Notebook
iplot(fig)

Building Train Test¶

In [26]:
from sklearn.model_selection import train_test_split

# Path to the main folder containing images
main_folder = '/Users/austensteinberg/Desktop/Machine Learning /ML_Final/compressed_dataset'

# Paths to the train and test folders
train_folder = '/Users/austensteinberg/Desktop/Machine Learning /ML_Final/train'
test_folder = '/Users/austensteinberg/Desktop/Machine Learning /ML_Final/test'

# Create train and test folders if they don't exist
os.makedirs(train_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)
In [48]:
# Set the path for training and validation directories
train_dir = '/Users/austensteinberg/Desktop/Machine Learning /ML_Final/train'
test_dir = '/Users/austensteinberg/Desktop/Machine Learning /ML_Final/test'

# Define parameters for image loading
batch_size = 32
img_height = 224
img_width = 224

# Load and preprocess images from the training directory
train_ds = tf.keras.utils.image_dataset_from_directory(
    train_dir,
    validation_split=0.2,
    subset="training",
    seed=123,
    image_size=(img_height, img_width),
    batch_size=batch_size)

# Load and preprocess images from the training directory for validation
val_ds = tf.keras.utils.image_dataset_from_directory(
    train_dir,
    validation_split=0.2,
    subset="validation",
    seed=123,
    image_size=(img_height, img_width),
    batch_size=batch_size)
Found 34303 files belonging to 75 classes.
Using 27443 files for training.
Found 34303 files belonging to 75 classes.
Using 6860 files for validation.

First Model¶

In [50]:
# Load and preprocess images from the training directory
train_ds = tf.keras.utils.image_dataset_from_directory(
    train_dir,
    validation_split=0.2,
    subset="training",
    seed=123,
    image_size=(img_height, img_width),
    batch_size=batch_size)

val_ds = tf.keras.utils.image_dataset_from_directory(
    train_dir,
    validation_split=0.2,
    subset="validation",
    seed=123,
    image_size=(img_height, img_width),
    batch_size=batch_size)

# Extract class names
class_names = train_ds.class_names


AUTOTUNE = tf.data.AUTOTUNE
train_ds = train_ds.cache().shuffle(1000).prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

# Now, build your model
model = tf.keras.Sequential([
    tf.keras.layers.Rescaling(1./255),
    tf.keras.layers.Conv2D(32, 3, activation='relu'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(32, 3, activation='relu'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(len(class_names))  # Use class_names extracted earlier
])

model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

model.summary()
Found 34303 files belonging to 75 classes.
Using 27443 files for training.
Found 34303 files belonging to 75 classes.
Using 6860 files for validation.
Model: "sequential_1"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ rescaling_1 (Rescaling)         │ ?                      │   0 (unbuilt) │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ conv2d_2 (Conv2D)               │ ?                      │   0 (unbuilt) │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ max_pooling2d_2 (MaxPooling2D)  │ ?                      │   0 (unbuilt) │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ conv2d_3 (Conv2D)               │ ?                      │   0 (unbuilt) │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ max_pooling2d_3 (MaxPooling2D)  │ ?                      │   0 (unbuilt) │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ flatten_1 (Flatten)             │ ?                      │   0 (unbuilt) │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_4 (Dense)                 │ ?                      │   0 (unbuilt) │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_5 (Dense)                 │ ?                      │   0 (unbuilt) │
└─────────────────────────────────┴────────────────────────┴───────────────┘
 Total params: 0 (0.00 B)
 Trainable params: 0 (0.00 B)
 Non-trainable params: 0 (0.00 B)
In [36]:
model = tf.keras.Sequential([
    tf.keras.layers.Rescaling(1./255, input_shape=(img_height, img_width, 3)),  # Specify input shape here
    tf.keras.layers.Conv2D(32, 3, activation='relu'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(32, 3, activation='relu'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(len(class_names), activation='softmax')  # Add activation 'softmax' for multi-class classification
])

model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),  # Changed 'from_logits=True' to 'False' because softmax is used
    metrics=['accuracy']
)

model.summary()
/Users/austensteinberg/anaconda3/lib/python3.11/site-packages/keras/src/layers/preprocessing/tf_data_layer.py:18: UserWarning:

Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.

Model: "sequential_1"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓
┃ Layer (type)                    ┃ Output Shape           ┃       Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩
│ rescaling_1 (Rescaling)         │ (None, 224, 224, 3)    │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ conv2d_2 (Conv2D)               │ (None, 222, 222, 32)   │           896 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ max_pooling2d_2 (MaxPooling2D)  │ (None, 111, 111, 32)   │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ conv2d_3 (Conv2D)               │ (None, 109, 109, 32)   │         9,248 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ max_pooling2d_3 (MaxPooling2D)  │ (None, 54, 54, 32)     │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ flatten_1 (Flatten)             │ (None, 93312)          │             0 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_2 (Dense)                 │ (None, 128)            │    11,944,064 │
├─────────────────────────────────┼────────────────────────┼───────────────┤
│ dense_3 (Dense)                 │ (None, 76)             │         9,804 │
└─────────────────────────────────┴────────────────────────┴───────────────┘
 Total params: 11,964,012 (45.64 MB)
 Trainable params: 11,964,012 (45.64 MB)
 Non-trainable params: 0 (0.00 B)
In [37]:
epochs = 5

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs
)
Epoch 1/5
2024-05-13 20:00:17.373984: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] ShuffleDatasetV3:29: Filling up shuffle buffer (this may take a while): 142 of 1000
2024-05-13 20:00:37.359068: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] ShuffleDatasetV3:29: Filling up shuffle buffer (this may take a while): 393 of 1000
2024-05-13 20:00:57.345997: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] ShuffleDatasetV3:29: Filling up shuffle buffer (this may take a while): 618 of 1000
2024-05-13 20:01:00.432207: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:480] Shuffle buffer filled.
654/654 ━━━━━━━━━━━━━━━━━━━━ 563s 778ms/step - accuracy: 0.0887 - loss: 3.8057 - val_accuracy: 0.1011 - val_loss: 3.6507
Epoch 2/5
654/654 ━━━━━━━━━━━━━━━━━━━━ 506s 773ms/step - accuracy: 0.1101 - loss: 3.6267 - val_accuracy: 0.1019 - val_loss: 3.6517
Epoch 3/5
654/654 ━━━━━━━━━━━━━━━━━━━━ 535s 818ms/step - accuracy: 0.1956 - loss: 3.2188 - val_accuracy: 0.0790 - val_loss: 3.9744
Epoch 4/5
654/654 ━━━━━━━━━━━━━━━━━━━━ 545s 833ms/step - accuracy: 0.5347 - loss: 1.8118 - val_accuracy: 0.0570 - val_loss: 5.3174
Epoch 5/5
654/654 ━━━━━━━━━━━━━━━━━━━━ 534s 815ms/step - accuracy: 0.8381 - loss: 0.6487 - val_accuracy: 0.0533 - val_loss: 7.8753
In [38]:
import matplotlib.pyplot as plt

# Summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# Summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()
In [84]:
model.save('/Users/austensteinberg/Desktop/Machine Learning /ML_Final/model.h5')
WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `keras.saving.save_model(model)`. This file format is considered legacy. We recommend using instead the native Keras format, e.g. `model.save('my_model.keras')` or `keras.saving.save_model(model, 'my_model.keras')`. 
In [39]:
test_ds = tf.keras.utils.image_dataset_from_directory(
    test_dir,
    image_size=(img_height, img_width),
    batch_size=batch_size)

test_loss, test_acc = model.evaluate(test_ds)
print(f"Test accuracy: {test_acc}")
Found 11255 files belonging to 75 classes.
352/352 ━━━━━━━━━━━━━━━━━━━━ 91s 259ms/step - accuracy: 0.0387 - loss: 9.0903
Test accuracy: 0.038560640066862106

Second Model + Drop-Out + Regularization + Early Stopping¶

In [93]:
epochs = 5 
data_augmentation = tf.keras.Sequential([
    RandomFlip("horizontal"),
    RandomRotation(0.2),
])

# Incorporate data augmentation 
model = tf.keras.Sequential([
    data_augmentation,
    tf.keras.layers.Rescaling(1./255),
    tf.keras.layers.Conv2D(32, 3, activation='relu'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(32, 3, activation='relu'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),  # Adding Dropout
    tf.keras.layers.Dense(len(class_names), activation='softmax')
])

# Add dropout 
tf.keras.layers.Dropout(0.5),

from tensorflow.keras.regularizers import l2

# Add L2 regularization to the convolutional layers
tf.keras.layers.Conv2D(32, 3, activation='relu', kernel_regularizer=l2(0.01)),

from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(monitor='val_loss', patience=5)


model.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False), 
    metrics=['accuracy']
             )
# Include early stopping in the fit function
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
    callbacks=[early_stopping]
)
Epoch 1/5
654/654 ━━━━━━━━━━━━━━━━━━━━ 570s 869ms/step - accuracy: 0.0860 - loss: 4.0421 - val_accuracy: 0.1008 - val_loss: 3.6353
Epoch 2/5
654/654 ━━━━━━━━━━━━━━━━━━━━ 612s 935ms/step - accuracy: 0.0998 - loss: 3.6902 - val_accuracy: 0.0943 - val_loss: 3.6349
Epoch 3/5
654/654 ━━━━━━━━━━━━━━━━━━━━ 574s 877ms/step - accuracy: 0.1021 - loss: 3.6833 - val_accuracy: 0.1008 - val_loss: 3.6343
Epoch 4/5
654/654 ━━━━━━━━━━━━━━━━━━━━ 602s 920ms/step - accuracy: 0.1031 - loss: 3.6654 - val_accuracy: 0.1008 - val_loss: 3.6316
Epoch 5/5
654/654 ━━━━━━━━━━━━━━━━━━━━ 656s 1s/step - accuracy: 0.1096 - loss: 3.6476 - val_accuracy: 0.1008 - val_loss: 3.6334
In [94]:
import matplotlib.pyplot as plt

# Summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model 2 Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# Summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model 2 Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()
In [ ]:
model.save('/Users/austensteinberg/Desktop/Machine Learning /ML_Final/model.h5')
In [98]:
test_ds = tf.keras.utils.image_dataset_from_directory(
    test_dir,
    image_size=(img_height, img_width),
    batch_size=batch_size)

test_loss, test_acc = model.evaluate(test_ds)
print(f"Test accuracy: {test_acc}")
Found 11255 files belonging to 75 classes.
352/352 ━━━━━━━━━━━━━━━━━━━━ 92s 261ms/step - accuracy: 0.1007 - loss: 4.1985
Test accuracy: 0.1023545116186142

Model Resnet¶

In [103]:
# Load ResNet50 base model, pretrained on ImageNet, without the top classification layer
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(img_height, img_width, 3))
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
94765736/94765736 ━━━━━━━━━━━━━━━━━━━━ 2s 0us/step
In [104]:
# Freeze all the layers in the base model
for layer in base_model.layers:
    layer.trainable = False
In [105]:
# Add custom top layers to the base model
x = base_model.output
x = GlobalAveragePooling2D()(x)  # Add a global average pooling layer after the base model
x = Dense(1024, activation='relu')(x)  # Add a fully connected layer with 1024 units and ReLU activation
x = Dropout(0.5)(x)  # Add dropout for regularization
predictions = Dense(len(class_names), activation='softmax')(x)  # Output layer with softmax activation

# Define the complete model
model = Model(inputs=base_model.input, outputs=predictions)

model.compile(optimizer=Adam(learning_rate=0.0001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
In [108]:
epochs = 5  
callbacks = [EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)]

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
    callbacks=callbacks
)
Epoch 1/5
654/654 ━━━━━━━━━━━━━━━━━━━━ 1281s 2s/step - accuracy: 0.0800 - loss: 3.9036 - val_accuracy: 0.0979 - val_loss: 3.6792
Epoch 2/5
654/654 ━━━━━━━━━━━━━━━━━━━━ 1365s 2s/step - accuracy: 0.0983 - loss: 3.6964 - val_accuracy: 0.0933 - val_loss: 3.6680
Epoch 3/5
654/654 ━━━━━━━━━━━━━━━━━━━━ 1382s 2s/step - accuracy: 0.1017 - loss: 3.6697 - val_accuracy: 0.0920 - val_loss: 3.6678
Epoch 4/5
654/654 ━━━━━━━━━━━━━━━━━━━━ 1379s 2s/step - accuracy: 0.1054 - loss: 3.6287 - val_accuracy: 0.1017 - val_loss: 3.6674
Epoch 5/5
654/654 ━━━━━━━━━━━━━━━━━━━━ 1503s 2s/step - accuracy: 0.1171 - loss: 3.5885 - val_accuracy: 0.0904 - val_loss: 3.6744
In [109]:
# Summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model 4 Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# Summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model 4 Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()
In [110]:
# Evaluate the model on the test dataset
test_loss, test_acc = model.evaluate(test_ds)
print(f"Test accuracy: {test_acc}")
352/352 ━━━━━━━━━━━━━━━━━━━━ 663s 2s/step - accuracy: 0.0960 - loss: 4.1996
Test accuracy: 0.09764549136161804
In [ ]:
# Summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model 4 Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

# Summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model 4 Loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()

Resnet with L2 and randomSearch¶

In [79]:
def build_model(hp):
    base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(img_height, img_width, 3))
    
    # Freeze the layers of the base model
    for layer in base_model.layers:
        layer.trainable = False
    
    # Add custom top layers
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = Dense(
        units=hp.Int('units', min_value=512, max_value=1024, step=128),
        activation='relu',
        kernel_regularizer=l2(hp.Float('l2_regularization', min_value=1e-4, max_value=1e-2, sampling='LOG'))
    )(x)
    x = Dropout(hp.Float('dropout_rate', min_value=0.4, max_value=0.7, step=0.1))(x)
    predictions = Dense(len(class_names), activation='softmax')(x)
    
    model = Model(inputs=base_model.input, outputs=predictions)
    
    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    
    return model
In [80]:
from kerastuner.tuners import RandomSearch

tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=3,
    executions_per_trial=1,
    directory='model_tuning',
    project_name='ResNet50Tuning'
)
Reloading Tuner from model_tuning/ResNet50Tuning/tuner0.json
In [81]:
# Start the search
tuner.search(train_ds, validation_data=val_ds, epochs=3, callbacks=[EarlyStopping(monitor='val_loss', patience=3)])

# Get the best model
best_model = tuner.get_best_models(num_models=1)[0]
Trial 3 Complete [01h 22m 26s]
val_accuracy: 0.10302416235208511

Best val_accuracy So Far: 0.10302416235208511
Total elapsed time: 05h 35m 34s
/Users/austensteinberg/anaconda3/lib/python3.11/site-packages/keras/src/saving/saving_lib.py:418: UserWarning:

Skipping variable loading for optimizer 'adam', because it has 2 variables whereas the saved optimizer has 10 variables. 

In [82]:
test_loss, test_acc = best_model.evaluate(test_ds)
print(f"Test accuracy: {test_acc}")
653/653 ━━━━━━━━━━━━━━━━━━━━ 1017s 2s/step - accuracy: 0.0968 - loss: 3.6481
Test accuracy: 0.09401053935289383
In [84]:
# Assuming `tuner` is your RandomSearch object after the search has been completed.
trials = tuner.oracle.get_best_trials(num_trials=tuner.oracle.max_trials)

# Initialize lists to store the accuracy and loss values
val_accuracies = []
val_losses = []

for trial in trials:
    # Append the best value of the validation accuracy and validation loss from each trial
    val_accuracies.append(trial.metrics.get_best_value('val_accuracy'))
    val_losses.append(trial.metrics.get_best_value('val_loss'))

# Generate indices for x-axis
trial_indices = list(range(1, len(val_accuracies) + 1))
In [85]:
# Plotting the validation accuracies
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
plt.plot(trial_indices, val_accuracies, marker='o', color='b', label='Validation Accuracy')
plt.title('Validation Accuracy per Trial')
plt.xlabel('Trial')
plt.ylabel('Validation Accuracy')
plt.grid(True)
plt.xticks(trial_indices)
plt.legend()

# Plotting the validation losses
plt.subplot(1, 2, 2)
plt.plot(trial_indices, val_losses, marker='o', color='r', label='Validation Loss')
plt.title('Validation Loss per Trial')
plt.xlabel('Trial')
plt.ylabel('Validation Loss')
plt.grid(True)
plt.xticks(trial_indices)
plt.legend()

plt.tight_layout()
plt.show()
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]: